##### Comparisons by differents states #####

library(tidyverse)
library(readxl)

edo_result <- read_excel("data/pres_elec_2006-2018_resultados_estados.xlsx") %>% 
  mutate(
    real_of_p = of_p*100,
    real_op_p = op_p*100,
    real_otro_p = otro_p*100,
    real_turnout_p = turnout_p*100,
    estado = case_when(
      estado == "MÉRIDA" ~ "MERIDA",
      TRUE ~ estado
    ),
    year = as.factor(year),
    year_edo = paste(year,estado, sep = "_")
  ) %>% select(year_edo, starts_with("real"))

summary(edo_result)

ven <- readRDS("ven_elec_2006_2024_final.rds")

ven_edo <- ven %>% 
   filter(!year == 2024) %>% 
   group_by(year, estado) %>% 
   summarise(of_p = weighted.mean(of_p, validos, na.rm = T),
             op_p = weighted.mean(op_p, validos, na.rm = T),
             otro_p = weighted.mean(otro_p, validos, na.rm = T),
             turnout_p = weighted.mean(turnout, rep_c, na.rm = T)) %>%
  mutate(year_edo = paste(year, estado, sep = "_")) %>% 
   left_join(., edo_result) %>% 
  mutate(
    dif_of = of_p - real_of_p,
    dif_op = op_p - real_op_p,
    dif_otro = otro_p - real_otro_p,
    dif_turnout = turnout_p - real_turnout_p
  )



dif_edo_summary <- ven_edo %>% 
  group_by(year) %>% 
  summarise(
    mean_dif_of = mean(dif_of, na.rm = TRUE),
    se_dif_of = sd(dif_of, na.rm = TRUE) / sqrt(sum(!is.na(dif_of))),
    
    mean_dif_op = mean(dif_op, na.rm = TRUE),
    se_dif_op = sd(dif_op, na.rm = TRUE) / sqrt(sum(!is.na(dif_op))),
    
    mean_dif_otro = mean(dif_otro, na.rm = TRUE),
    se_dif_otro = sd(dif_otro, na.rm = TRUE) / sqrt(sum(!is.na(dif_otro))),
    
    mean_dif_turnout = mean(dif_turnout, na.rm = TRUE),
    se_dif_turnout = sd(dif_turnout, na.rm = TRUE) / sqrt(sum(!is.na(dif_turnout)))
  )

dif_long_means <- dif_edo_summary %>%
  select(year, starts_with("mean")) %>%
  pivot_longer(cols = -year, names_to = "Metric", values_to = "Difference")

dif_long_ses <- dif_edo_summary %>%
  select(year, starts_with("se")) %>%
  pivot_longer(cols = -year, names_to = "Metric", values_to = "SE") %>%
  mutate(Metric = gsub("se_", "mean_", Metric))  # Align metric names with the mean table

# Merge means and SEs
dif_edo_differences_long <- left_join(dif_long_means, dif_long_ses, by = c("year", "Metric"))

 
 # Define colors for each variable
 color_palette <- c(
   "mean_dif_of" = "red",
   "mean_dif_op" = "blue",
   "mean_dif_otro" = "purple",
   "mean_dif_turnout" = "darkgreen"
 )
 
 # Define shape mappings
 shape_palette <- c(
   "mean_dif_of" = 16,    # Chavismo (solid circle)
   "mean_dif_op" = 17,    # Opposition (triangle)
   "mean_dif_otro" = 15,  # Other Candidates (diamond)
   "mean_dif_turnout" = 5 # Turnout (square)
 )
 
 # Plot the differences over time using shapes
 # Define position dodge
 dodge_width <- position_dodge(width = 0.4)
 
 sample_real_data <- ggplot(dif_edo_differences_long, aes(x = year, y = Difference, shape = Metric)) +
   geom_point(size = 2, position = dodge_width) +
   geom_errorbar(
     aes(ymin = Difference - 1.96 * SE,
         ymax = Difference + 1.96 * SE),
     width = 0.2,
     position = dodge_width
   ) +
   scale_shape_manual(
     values = shape_palette,
     labels = c("Chavismo", "Opposition", "Other Candidates", "Turnout")
   ) +
   geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
   theme_minimal() +
   labs(
     title = "Figure 5: Comparison of sample data vs. real electoral data",
     subtitle = "Mean differences in vote share and turnout across elections",
     x = "",
     y = "Mean Difference (pp)",
     shape = "",
     caption = "Source: Compiled by the authors. Note: Dashed line represents zero difference. Values close \nto zero indicate minimal bias in the sample. Real electoral data obtained from Wikipedia \npages and public tallies in the press. Error bars represent 95% confidence intervals."
   ) +
   scale_y_continuous(limits = c(-2, 2)) +
   theme_bw() +
   theme(
     axis.title.y = element_text(vjust = 2, size = 12),
     axis.text.x = element_text(size = 10),
     legend.text = element_text(size = 10),
     title = element_text(size = 14),
     plot.caption = element_text(hjust = 0, face = "italic")
   )
 
 sample_real_data

ggsave("sample_vs_real_data.jpg", sample_real_data, width = 10, height = 6) 


# Compute standard deviations of differences
sd_edo_summary <- ven_edo %>%
  group_by(year) %>%
  summarise(
    sd_dif_of = sd(dif_of, na.rm = TRUE),
    sd_dif_op = sd(dif_op, na.rm = TRUE),
    sd_dif_otro = sd(dif_otro, na.rm = TRUE),
    sd_dif_turnout = sd(dif_turnout, na.rm = TRUE)
  )

sd_edo_differences_long <- sd_edo_summary %>%
  pivot_longer(cols = -year, names_to = "Metric", values_to = "SD")

# Define shape palette again
shape_palette <- c(
  "sd_dif_of" = 16,
  "sd_dif_op" = 17,
  "sd_dif_otro" = 15,
  "sd_dif_turnout" = 5
)

# Plot standard deviations
plot_sd_diff <- ggplot(sd_edo_differences_long, aes(x = year, y = SD, shape = Metric)) +
  geom_point(size = 2, position = position_dodge(width = 0.4)) +
  scale_shape_manual(
    values = shape_palette,
    labels = c("Chavismo", "Opposition", "Other Candidates", "Turnout")
  ) +
  theme_minimal() +
  labs(
    title = "Standard Deviation of Sample vs. Real Data Differences",
    subtitle = "By Election Year and Political Metric",
    x = "",
    y = "Standard Deviation of Differences (pp)",
    shape = "",
    caption = "Higher values indicate more variation between sample data and real electoral data across states."
  ) +
  theme_bw() +
  scale_y_continuous(limits = c(0, 2)) +
  theme(
    axis.title.y = element_text(vjust = 2, size = 12),
    axis.text.x = element_text(size = 10),
    legend.text = element_text(size = 10),
    title = element_text(size = 14),
    plot.caption = element_text(hjust = 0, face = "italic")
  )

plot_sd_diff


 